1 Summary of data manipulation

2 Load packages and data set

if (!require("plotly")) install.packages("plotly")
## Loading required package: plotly
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
if (!require("gridExtra")) install.packages("gridExtra")
## Loading required package: gridExtra
if (!require("purrr")) install.packages("purrr")
## Loading required package: purrr
if (!require("Hmisc")) install.packages("Hmisc")
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following objects are masked from 'package:base':
## 
##     format.pval, units
if (!require("naniar")) install.packages("naniar")
## Loading required package: naniar
if (!require("lubridate")) install.packages("lubridate")
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
if (!require("forcats")) install.packages("forcats")
## Loading required package: forcats
if (!require("skimr")) install.packages("skimr")
## Loading required package: skimr
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:naniar':
## 
##     n_complete
## The following object is masked from 'package:stats':
## 
##     filter
if (!require("stats")) install.packages("stats")
if (!require("scales")) install.packages("scales")
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
library(plotly)
library(gridExtra)
library(purrr)
library(Hmisc)
library(naniar)
library(lubridate)
library(forcats)
library(skimr)
library(stats)
library(scales)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.3     ✔ readr   1.3.1
## ✔ tidyr   0.8.3     ✔ dplyr   0.8.3
## ✔ tibble  2.1.3     ✔ stringr 1.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ readr::col_factor()      masks scales::col_factor()
## ✖ dplyr::combine()         masks gridExtra::combine()
## ✖ lubridate::date()        masks base::date()
## ✖ scales::discard()        masks purrr::discard()
## ✖ dplyr::filter()          masks skimr::filter(), plotly::filter(), stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ dplyr::src()             masks Hmisc::src()
## ✖ dplyr::summarize()       masks Hmisc::summarize()
## ✖ lubridate::union()       masks base::union()
adoption <- read_csv("adoption_final.csv")
## Parsed with column specification:
## cols(
##   Month = col_character(),
##   Customer_ID = col_character(),
##   Customer_size = col_character(),
##   Geo_Code = col_character(),
##   Visualize = col_character(),
##   Alert = col_character(),
##   Report = col_character()
## )
april <- read_csv("april_billing_final.csv")
## Parsed with column specification:
## cols(
##   Billing_month = col_datetime(format = ""),
##   Customer_ID = col_character(),
##   Registration_date = col_date(format = ""),
##   product_id = col_double(),
##   Billed_amount = col_double()
## )
june <- read_csv("june_billing_final.csv")
## Parsed with column specification:
## cols(
##   Billing_month = col_datetime(format = ""),
##   Customer_ID = col_character(),
##   Registration_date = col_date(format = ""),
##   product_id = col_double(),
##   Billed_amount = col_double()
## )
may <- read_csv("may_billing_final.csv")
## Parsed with column specification:
## cols(
##   Billing_month = col_datetime(format = ""),
##   Customer_ID = col_character(),
##   Registration_date = col_date(format = ""),
##   product_id = col_double(),
##   Billed_amount = col_double()
## )

3 Invidual tables exploration

3.1 Adoption

  • April: 45,604

  • May: 47,272

  • June: 47,149

  • Missing 16 GEO-code

  • 140,025 observation –> remove 16 missing values in GEO-code

describe(adoption)
## adoption 
## 
##  7  Variables      140025  Observations
## ---------------------------------------------------------------------------
## Month 
##        n  missing distinct 
##   140025        0        3 
##                                      
## Value      April-19  June-19   May-19
## Frequency     45604    47149    47272
## Proportion    0.326    0.337    0.338
## ---------------------------------------------------------------------------
## Customer_ID 
##        n  missing distinct 
##   140025        0    52829 
## 
## lowest : 004676561936772 00a790034978456 00b681334781982 00D351757638125 00F721755330737
## highest: zzX629055216030 ZzX879532807956 zZy662437699323 zzy718159275481 Zzy795772531160
## ---------------------------------------------------------------------------
## Customer_size 
##        n  missing distinct 
##   140025        0        3 
##                                
## Value       Large    Mid  Small
## Frequency     392   1714 137919
## Proportion  0.003  0.012  0.985
## ---------------------------------------------------------------------------
## Geo_Code 
##        n  missing distinct 
##   140009       16        7 
##                                                                   
## Value               AMER          APAC          CHNA          EMEA
## Frequency          71874         17766          6388         30877
## Proportion         0.513         0.127         0.046         0.221
##                                                     
## Value      GEO-UNCLAIMED          GLBL          JAPN
## Frequency             47           634         12423
## Proportion         0.000         0.005         0.089
## ---------------------------------------------------------------------------
## Visualize 
##        n  missing distinct 
##   140025        0        2 
##                         
## Value          No    Yes
## Frequency  104168  35857
## Proportion  0.744  0.256
## ---------------------------------------------------------------------------
## Alert 
##        n  missing distinct 
##   140025        0        2 
##                         
## Value          No    Yes
## Frequency  131290   8735
## Proportion  0.938  0.062
## ---------------------------------------------------------------------------
## Report 
##        n  missing distinct 
##   140025        0        2 
##                         
## Value          No    Yes
## Frequency  131927   8098
## Proportion  0.942  0.058
## ---------------------------------------------------------------------------

3.1.1 Clean data

#Show rows wiht missing value
adopt_missing <- adoption[is.na(adoption$Geo_Code),]

adopt_missing
## # A tibble: 16 x 7
##    Month    Customer_ID     Customer_size Geo_Code Visualize Alert Report
##    <chr>    <chr>           <chr>         <chr>    <chr>     <chr> <chr> 
##  1 April-19 3ve622950616356 Small         <NA>     No        No    No    
##  2 May-19   3ve622950616356 Small         <NA>     No        No    No    
##  3 June-19  3ve622950616356 Small         <NA>     No        No    No    
##  4 April-19 gMS335204256682 Small         <NA>     No        No    No    
##  5 May-19   gMS335204256682 Mid           <NA>     No        No    No    
##  6 June-19  gMS335204256682 Mid           <NA>     No        No    No    
##  7 April-19 Dyu886342809534 Small         <NA>     No        No    No    
##  8 May-19   Dyu886342809534 Small         <NA>     No        No    No    
##  9 June-19  Dyu886342809534 Small         <NA>     No        No    No    
## 10 June-19  4Yq076930086263 Small         <NA>     No        No    No    
## 11 June-19  sNr024265486996 Small         <NA>     No        No    No    
## 12 June-19  uLn203194071015 Small         <NA>     No        No    No    
## 13 June-19  f2P113951421659 Small         <NA>     No        No    No    
## 14 June-19  leE241742919990 Small         <NA>     No        No    No    
## 15 June-19  EjH888820866536 Small         <NA>     No        No    No    
## 16 June-19  JWE850603294351 Small         <NA>     No        No    No
unique(adopt_missing$Customer_ID)
##  [1] "3ve622950616356" "gMS335204256682" "Dyu886342809534"
##  [4] "4Yq076930086263" "sNr024265486996" "uLn203194071015"
##  [7] "f2P113951421659" "leE241742919990" "EjH888820866536"
## [10] "JWE850603294351"
length(unique(adopt_missing$Customer_ID)) # 10 id with missing geo_code
## [1] 10
#Filter out these 16 rows
adoption <- adoption %>% filter(!is.na(Geo_Code ))

3.1.2 Customer size distribution

adoption %>% 
  group_by(Customer_size) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size, pct), y = pct)) + geom_bar(stat='identity') + 
  scale_y_continuous(labels = scales::percent) + 
  geom_text(aes(label=paste(count), y=pct+0.025), size=4)

3.1.3 Geo location

adoption %>% 
  group_by(Geo_Code) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Geo_Code, pct), y = pct)) + geom_bar(stat='identity') + 
  scale_y_continuous(labels = scales::percent) + 
  geom_text(aes(label=paste(count), y=pct+0.025), size=4)

##+ Service

grid.arrange(
adoption %>% 
  group_by(Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Visualize, pct), y = pct)) + geom_bar(stat='identity') + 
  scale_y_continuous(labels = scales::percent) + 
  geom_text(aes(label=paste(count), y=pct+0.025), size=4),
adoption %>% 
  group_by(Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Alert, pct), y = pct)) + geom_bar(stat='identity') + 
  scale_y_continuous(labels = scales::percent) + 
  geom_text(aes(label=paste(count), y=pct+0.025), size=4),
adoption %>% 
  group_by(Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Report, pct), y = pct)) + geom_bar(stat='identity') + 
  scale_y_continuous(labels = scales::percent) + 
  geom_text(aes(label=paste(count), y=pct+0.025), size=4),ncol = 3)

3.1.3.1 Service by Customer size

grid.arrange(
adoption %>% 
  group_by(Customer_size, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size, pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),
adoption %>% 
  group_by(Customer_size,Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size, pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent) ,
adoption %>% 
  group_by(Customer_size,Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size, pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),ncol = 1)

3.1.3.2 Service by GEO

grid.arrange(
adoption %>% 
  group_by(Geo_Code, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Geo_Code, pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),
adoption %>% 
  group_by(Geo_Code,Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Geo_Code, pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent) ,
adoption %>% 
  group_by(Geo_Code,Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Geo_Code, pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),ncol = 1)

3.1.4 Add service use indication

# Create adoption-category data

VAR <- adoption %>% 
             filter(grepl('Yes', Visualize) & grepl('Yes', Alert) & grepl('Yes', Report)) %>%
             mutate(Tools = "VAR")

VA <- adoption %>% 
             filter(grepl('Yes', Visualize) & grepl('Yes', Alert) & grepl('No', Report)) %>%
             mutate(Tools = "VA")

VR <- adoption %>% 
             filter(grepl('Yes', Visualize) & grepl('No', Alert) & grepl('Yes', Report)) %>%
             mutate(Tools = "VR")

AR <- adoption %>% 
             filter(grepl('No', Visualize) & grepl('Yes', Alert) & grepl('Yes', Report)) %>%
             mutate(Tools = "AR")

V <- adoption %>% 
             filter(grepl('Yes', Visualize) & grepl('No', Alert) & grepl('No', Report)) %>%
             mutate(Tools = "V")

A <- adoption %>% 
             filter(grepl('No', Visualize) & grepl('Yes', Alert) & grepl('No', Report)) %>%
             mutate(Tools = "A")

R <- adoption %>% 
             filter(grepl('No', Visualize) & grepl('No', Alert) & grepl('Yes', Report)) %>%
             mutate(Tools = "R")

N <- adoption %>% 
             filter(grepl('No', Visualize) & grepl('No', Alert) & grepl('No', Report)) %>%
             mutate(Tools = "N")

adoption<- bind_rows(VAR, VA, AR, VR,
                    V, A, R, N) %>%  mutate(Tools = as.factor(Tools))

3.2 April

describe(april)
## april 
## 
##  5  Variables      528806  Observations
## ---------------------------------------------------------------------------
## Billing_month 
##          n    missing   distinct       Info       Mean        Gmd 
##     528806          0          1          0 2019-04-01 1970-01-01 
##                      
## Value      1554076800
## Frequency      528806
## Proportion          1
## ---------------------------------------------------------------------------
## Customer_ID 
##        n  missing distinct 
##   528806        0    45876 
## 
## lowest : 004676561936772 00a790034978456 00b681334781982 00D351757638125 00F721755330737
## highest: ZZx567245693938 zzX629055216030 zZy662437699323 zzy718159275481 Zzy795772531160
## ---------------------------------------------------------------------------
## Registration_date 
##        n  missing distinct 
##   528806        0     4279 
## 
## lowest : 2004-08-21 2004-08-22 2004-09-02 2004-09-18 2004-10-04
## highest: 2019-04-22 2019-04-23 2019-04-24 2019-04-25 2019-04-26
## ---------------------------------------------------------------------------
## product_id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   506854    21952     2177    0.997     7735     9009        8       10 
##      .25      .50      .75      .90      .95 
##      938     3299    11921    17724    25460 
## 
## lowest :      2      7      8     10     12, highest: 104054 104099 104149 104166 104398
## ---------------------------------------------------------------------------
## Billed_amount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   528806        0   294652    0.973    404.7      733    0.000    0.000 
##      .25      .50      .75      .90      .95 
##    0.000    2.137  108.000  911.703 1798.050 
## 
## lowest : -16007.837 -10051.695  -4266.879  -4094.760  -3978.232
## highest: 297159.973 367635.247 412586.133 414781.007 854206.203
## ---------------------------------------------------------------------------
april %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins = 100)
## Warning: Removed 21952 rows containing non-finite values (stat_bin).

3.2.1 Missing value in product_id means billed_amount = 0

  • 21,952 missing values in product_id
  • Solution: Assign 0 to missing product_id
  • Make all product_id discrete variable (even though they are number, they are names of products)
#Show missing values 
april[is.na(april$product_id),]
## # A tibble: 21,952 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-04-01 00:00:00 0046765619… 2015-06-18               NA
##  2 2019-04-01 00:00:00 00F7217553… 2012-01-16               NA
##  3 2019-04-01 00:00:00 00R0968053… 2012-01-30               NA
##  4 2019-04-01 00:00:00 00S8151738… 2016-02-04               NA
##  5 2019-04-01 00:00:00 00X7404388… 2017-06-01               NA
##  6 2019-04-01 00:00:00 0180506686… 2018-10-15               NA
##  7 2019-04-01 00:00:00 0185262367… 2007-10-20               NA
##  8 2019-04-01 00:00:00 01J1125053… 2015-11-14               NA
##  9 2019-04-01 00:00:00 01O6937090… 2015-05-28               NA
## 10 2019-04-01 00:00:00 01R0858474… 2017-11-29               NA
## # … with 21,942 more rows, and 1 more variable: Billed_amount <dbl>
#Assign missing value as 0
#april$product_id[is.na(april$product_id)] <- 0

3.2.2 Registration Date

range(april$Registration_date)
## [1] "2004-08-21" "2019-04-26"
april %>% ggplot(aes(x = Registration_date ))+ geom_histogram(bins = 17)

3.2.3 Billed Amount

  • highly skewed to 0
summary(april$Billed_amount)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -16007.8      0.0      2.1    404.7    108.0 854206.2
april %>% ggplot(aes(y = Billed_amount)) + geom_boxplot()

april[april$Billed_amount <0,] #133 negative billed amount 
## # A tibble: 133 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-04-01 00:00:00 07d8117843… 2016-06-16             1925
##  2 2019-04-01 00:00:00 0gL3474882… 2017-12-02             1925
##  3 2019-04-01 00:00:00 0hT1568119… 2018-08-10             1925
##  4 2019-04-01 00:00:00 1R30645204… 2017-08-02             1925
##  5 2019-04-01 00:00:00 1r38382634… 2017-11-06             1925
##  6 2019-04-01 00:00:00 2WN0679333… 2019-01-08             1925
##  7 2019-04-01 00:00:00 2bX4449667… 2015-07-28             1925
##  8 2019-04-01 00:00:00 2bx8772712… 2015-12-14             1925
##  9 2019-04-01 00:00:00 2iL2964594… 2018-01-18             1925
## 10 2019-04-01 00:00:00 2jj1545435… 2014-02-09             1925
## # … with 123 more rows, and 1 more variable: Billed_amount <dbl>
april[april$Billed_amount == 0,] # 157k have 0 billed amount 
## # A tibble: 157,667 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-04-01 00:00:00 0046765619… 2015-06-18             1958
##  2 2019-04-01 00:00:00 0046765619… 2015-06-18            11813
##  3 2019-04-01 00:00:00 0046765619… 2015-06-18               NA
##  4 2019-04-01 00:00:00 00D3517576… 2009-06-16               91
##  5 2019-04-01 00:00:00 00D3517576… 2009-06-16            10298
##  6 2019-04-01 00:00:00 00D3517576… 2009-06-16            13350
##  7 2019-04-01 00:00:00 00F7217553… 2012-01-16            11813
##  8 2019-04-01 00:00:00 00F7217553… 2012-01-16            20597
##  9 2019-04-01 00:00:00 00F7217553… 2012-01-16               NA
## 10 2019-04-01 00:00:00 00R0968053… 2012-01-30            10298
## # … with 157,657 more rows, and 1 more variable: Billed_amount <dbl>
april[april$Billed_amount > 1000,] 
## # A tibble: 49,001 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-04-01 00:00:00 0046765619… 2015-06-18               10
##  2 2019-04-01 00:00:00 00F7217553… 2012-01-16               10
##  3 2019-04-01 00:00:00 00F7217553… 2012-01-16              938
##  4 2019-04-01 00:00:00 00R0968053… 2012-01-30               10
##  5 2019-04-01 00:00:00 00R0968053… 2012-01-30              938
##  6 2019-04-01 00:00:00 00S6153818… 2018-09-11               10
##  7 2019-04-01 00:00:00 00S8151738… 2016-02-04               10
##  8 2019-04-01 00:00:00 00T0630891… 2018-02-14               10
##  9 2019-04-01 00:00:00 00T4181469… 2016-02-12               10
## 10 2019-04-01 00:00:00 00X7404388… 2017-06-01                8
## # … with 48,991 more rows, and 1 more variable: Billed_amount <dbl>
#Visualize smaller range of billed_amount
test_bill <- april[april$Billed_amount>0 & april$Billed_amount<1500,]
test_bill %>% ggplot(aes(Billed_amount)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.3 May

describe(may)
## may 
## 
##  5  Variables      552263  Observations
## ---------------------------------------------------------------------------
## Billing_month 
##          n    missing   distinct       Info       Mean        Gmd 
##     552263          0          1          0 2019-05-01 1970-01-01 
##                      
## Value      1556668800
## Frequency      552263
## Proportion          1
## ---------------------------------------------------------------------------
## Customer_ID 
##        n  missing distinct 
##   552263        0    47516 
## 
## lowest : 004676561936772 00a790034978456 00b681334781982 00F721755330737 00n198760607655
## highest: zZX252173851580 ZZx567245693938 ZzX879532807956 zZy662437699323 Zzy795772531160
## ---------------------------------------------------------------------------
## Registration_date 
##        n  missing distinct 
##   552263        0     4311 
## 
## lowest : 2004-08-21 2004-08-22 2004-09-02 2004-09-11 2004-09-18
## highest: 2019-05-22 2019-05-23 2019-05-24 2019-05-27 2019-05-29
## ---------------------------------------------------------------------------
## product_id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   529250    23013     2262    0.997     7921     9244        8       10 
##      .25      .50      .75      .90      .95 
##      938     3299    11921    18651    25460 
## 
## lowest :      2      7      8     10     12, highest: 111221 111223 111227 111433 111434
## ---------------------------------------------------------------------------
## Billed_amount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   552263        0   307740    0.973    408.4    740.1    0.000    0.000 
##      .25      .50      .75      .90      .95 
##    0.000    2.088  108.624  915.271 1811.600 
## 
## lowest : -17555.206  -7631.000  -6906.177  -5038.969  -4901.200
## highest: 287151.208 417530.563 424886.635 449439.475 900500.004
## ---------------------------------------------------------------------------
may %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins = 100)
## Warning: Removed 23013 rows containing non-finite values (stat_bin).

3.3.1 Missing value in product_id means billed_amount = 0

  • 23,013 missing values in product_id
  • Solution: Assign 0 to missing product_id
  • Make all product_id discrete variable (even though they are number, they are names of products)
#Show missing values 
may[is.na(may$product_id),]
## # A tibble: 23,013 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-05-01 00:00:00 0046765619… 2015-06-18               NA
##  2 2019-05-01 00:00:00 00F7217553… 2012-01-16               NA
##  3 2019-05-01 00:00:00 00R0968053… 2012-01-30               NA
##  4 2019-05-01 00:00:00 00S8151738… 2016-02-04               NA
##  5 2019-05-01 00:00:00 00X7404388… 2017-06-01               NA
##  6 2019-05-01 00:00:00 0180506686… 2018-10-15               NA
##  7 2019-05-01 00:00:00 0185262367… 2007-10-20               NA
##  8 2019-05-01 00:00:00 01J1125053… 2015-11-14               NA
##  9 2019-05-01 00:00:00 01O6937090… 2015-05-28               NA
## 10 2019-05-01 00:00:00 01R0858474… 2017-11-29               NA
## # … with 23,003 more rows, and 1 more variable: Billed_amount <dbl>
#Assign missing value as 0
#may$product_id[is.na(may$product_id)] <- 0

3.3.2 Registration Date

range(may$Registration_date)
## [1] "2004-08-21" "2019-05-29"
may %>% ggplot(aes(x = Registration_date ))+ geom_histogram(bins = 17)

3.3.3 Billed Amount

  • highly skewed to 0
summary(may$Billed_amount)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -17555.2      0.0      2.1    408.4    108.6 900500.0
may %>% ggplot(aes(y = Billed_amount)) + geom_boxplot()

may[may$Billed_amount <0,] #124 negative billed amount 
## # A tibble: 124 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-05-01 00:00:00 0gL3474882… 2017-12-02             1925
##  2 2019-05-01 00:00:00 0hT1568119… 2018-08-10             1925
##  3 2019-05-01 00:00:00 1CD7032740… 2014-11-18             1925
##  4 2019-05-01 00:00:00 1ff8916555… 2019-02-19             1925
##  5 2019-05-01 00:00:00 1y80992059… 2017-02-20             1925
##  6 2019-05-01 00:00:00 2bX4449667… 2015-07-28             1925
##  7 2019-05-01 00:00:00 2iL2964594… 2018-01-18             1925
##  8 2019-05-01 00:00:00 2m02498354… 2011-08-23             1925
##  9 2019-05-01 00:00:00 31O5385888… 2014-05-27             1925
## 10 2019-05-01 00:00:00 3Tn6704020… 2012-12-19             1925
## # … with 114 more rows, and 1 more variable: Billed_amount <dbl>
may[may$Billed_amount == 0,] # 165k have 0 billed amount 
## # A tibble: 165,875 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-05-01 00:00:00 0046765619… 2015-06-18             1958
##  2 2019-05-01 00:00:00 0046765619… 2015-06-18             2155
##  3 2019-05-01 00:00:00 0046765619… 2015-06-18            11813
##  4 2019-05-01 00:00:00 0046765619… 2015-06-18            26720
##  5 2019-05-01 00:00:00 0046765619… 2015-06-18               NA
##  6 2019-05-01 00:00:00 00F7217553… 2012-01-16            11813
##  7 2019-05-01 00:00:00 00F7217553… 2012-01-16            20597
##  8 2019-05-01 00:00:00 00F7217553… 2012-01-16               NA
##  9 2019-05-01 00:00:00 00R0968053… 2012-01-30            10298
## 10 2019-05-01 00:00:00 00R0968053… 2012-01-30               NA
## # … with 165,865 more rows, and 1 more variable: Billed_amount <dbl>
may[may$Billed_amount > 1000,] #51k have bill > 1000
## # A tibble: 51,373 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-05-01 00:00:00 0046765619… 2015-06-18               10
##  2 2019-05-01 00:00:00 00F7217553… 2012-01-16               10
##  3 2019-05-01 00:00:00 00F7217553… 2012-01-16              938
##  4 2019-05-01 00:00:00 00R0968053… 2012-01-30               10
##  5 2019-05-01 00:00:00 00R0968053… 2012-01-30              938
##  6 2019-05-01 00:00:00 00S6153818… 2018-09-11               10
##  7 2019-05-01 00:00:00 00S8151738… 2016-02-04               10
##  8 2019-05-01 00:00:00 00T0630891… 2018-02-14               10
##  9 2019-05-01 00:00:00 00T4181469… 2016-02-12               10
## 10 2019-05-01 00:00:00 00X7404388… 2017-06-01                8
## # … with 51,363 more rows, and 1 more variable: Billed_amount <dbl>
#Visualize smaller range of billed_amount
test_bill_may <- may[may$Billed_amount>0 & may$Billed_amount<1500,]
test_bill_may %>% ggplot(aes(Billed_amount)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.4 June

describe(june)
## june 
## 
##  5  Variables      556261  Observations
## ---------------------------------------------------------------------------
## Billing_month 
##          n    missing   distinct       Info       Mean        Gmd 
##     556261          0          1          0 2019-06-01 1970-01-01 
##                      
## Value      1559347200
## Frequency      556261
## Proportion          1
## ---------------------------------------------------------------------------
## Customer_ID 
##        n  missing distinct 
##   556261        0    47442 
## 
## lowest : 004676561936772 00a790034978456 00b681334781982 00F721755330737 00n198760607655
## highest: ZZw803293842918 zZX252173851580 ZZx567245693938 zZy662437699323 Zzy795772531160
## ---------------------------------------------------------------------------
## Registration_date 
##        n  missing distinct 
##   556261        0     4332 
## 
## lowest : 2004-08-21 2004-08-22 2004-09-02 2004-09-11 2004-09-18
## highest: 2019-06-23 2019-06-24 2019-06-27 2019-06-29 2019-06-30
## ---------------------------------------------------------------------------
## product_id 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   533283    22978     2300    0.997     8065     9421        8       10 
##      .25      .50      .75      .90      .95 
##      938     3299    11921    18651    25460 
## 
## lowest :      2      7      8     10     12, highest: 116105 116111 116114 116531 116533
## ---------------------------------------------------------------------------
## Billed_amount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##   556261        0   310357    0.974      407    737.9     0.00     0.00 
##      .25      .50      .75      .90      .95 
##     0.00     2.17   108.00   902.72  1794.85 
## 
## lowest : -17894.279  -6223.696  -4109.864  -3840.000  -3741.370
## highest: 376950.934 385215.775 417226.326 445296.786 919030.922
## ---------------------------------------------------------------------------
june %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(bins = 100)
## Warning: Removed 22978 rows containing non-finite values (stat_bin).

3.4.1 Missing value in product_id means billed_amount = 0

  • 22,978 missing values in product_id
  • Solution: Assign 0 to missing product_id
  • Make all product_id discrete variable (even though they are number, they are names of products)
#Show missing values 
june[is.na(june$product_id),]
## # A tibble: 22,978 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-06-01 00:00:00 0046765619… 2015-06-18               NA
##  2 2019-06-01 00:00:00 00F7217553… 2012-01-16               NA
##  3 2019-06-01 00:00:00 00R0968053… 2012-01-30               NA
##  4 2019-06-01 00:00:00 00S8151738… 2016-02-04               NA
##  5 2019-06-01 00:00:00 00X7404388… 2017-06-01               NA
##  6 2019-06-01 00:00:00 0180506686… 2018-10-15               NA
##  7 2019-06-01 00:00:00 0185262367… 2007-10-20               NA
##  8 2019-06-01 00:00:00 01J1125053… 2015-11-14               NA
##  9 2019-06-01 00:00:00 01O6937090… 2015-05-28               NA
## 10 2019-06-01 00:00:00 01R0858474… 2017-11-29               NA
## # … with 22,968 more rows, and 1 more variable: Billed_amount <dbl>
#Assign missing value as 0
#june$product_id[is.na(june$product_id)] <- 0

3.4.2 Registration Date

range(june$Registration_date)
## [1] "2004-08-21" "2019-06-30"
june %>% ggplot(aes(x = Registration_date ))+ geom_histogram(bins = 17)

3.4.3 Billed Amount

  • highly skewed to 0
summary(june$Billed_amount)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -17894.3      0.0      2.2    407.0    108.0 919030.9
june %>% ggplot(aes(y = Billed_amount)) + geom_boxplot()

june[june$Billed_amount <0,] #117 negative billed amount 
## # A tibble: 117 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-06-01 00:00:00 0gL3474882… 2017-12-02             1925
##  2 2019-06-01 00:00:00 0hT1568119… 2018-08-10             1925
##  3 2019-06-01 00:00:00 16P9061271… 2015-11-30             1925
##  4 2019-06-01 00:00:00 1HA8154862… 2017-06-16             1925
##  5 2019-06-01 00:00:00 1Xb0305152… 2018-09-24             1925
##  6 2019-06-01 00:00:00 1y80992059… 2017-02-20             1925
##  7 2019-06-01 00:00:00 1zn1806137… 2008-10-10             1925
##  8 2019-06-01 00:00:00 2bX4449667… 2015-07-28             1925
##  9 2019-06-01 00:00:00 2iL2964594… 2018-01-18             1925
## 10 2019-06-01 00:00:00 2m02498354… 2011-08-23             1925
## # … with 107 more rows, and 1 more variable: Billed_amount <dbl>
june[june$Billed_amount == 0,] # 164k have 0 billed amount 
## # A tibble: 164,587 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-06-01 00:00:00 0046765619… 2015-06-18             1958
##  2 2019-06-01 00:00:00 0046765619… 2015-06-18             2155
##  3 2019-06-01 00:00:00 0046765619… 2015-06-18            11813
##  4 2019-06-01 00:00:00 0046765619… 2015-06-18            26720
##  5 2019-06-01 00:00:00 0046765619… 2015-06-18               NA
##  6 2019-06-01 00:00:00 00F7217553… 2012-01-16            11813
##  7 2019-06-01 00:00:00 00F7217553… 2012-01-16            20597
##  8 2019-06-01 00:00:00 00F7217553… 2012-01-16               NA
##  9 2019-06-01 00:00:00 00R0968053… 2012-01-30               NA
## 10 2019-06-01 00:00:00 00S6153818… 2018-09-11            11813
## # … with 164,577 more rows, and 1 more variable: Billed_amount <dbl>
june[june$Billed_amount > 1000,] #51k have bill > 1000
## # A tibble: 51,264 x 5
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>                <dbl>
##  1 2019-06-01 00:00:00 0046765619… 2015-06-18               10
##  2 2019-06-01 00:00:00 00F7217553… 2012-01-16               10
##  3 2019-06-01 00:00:00 00F7217553… 2012-01-16              938
##  4 2019-06-01 00:00:00 00R0968053… 2012-01-30               10
##  5 2019-06-01 00:00:00 00R0968053… 2012-01-30              938
##  6 2019-06-01 00:00:00 00S6153818… 2018-09-11               10
##  7 2019-06-01 00:00:00 00S8151738… 2016-02-04               10
##  8 2019-06-01 00:00:00 00T0630891… 2018-02-14               10
##  9 2019-06-01 00:00:00 00T4181469… 2016-02-12               10
## 10 2019-06-01 00:00:00 00X7404388… 2017-06-01                8
## # … with 51,254 more rows, and 1 more variable: Billed_amount <dbl>
#Visualize smaller range of billed_amount
test_bill_june <- june[june$Billed_amount>0 & june$Billed_amount<1500,]
test_bill_june %>% ggplot(aes(Billed_amount)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4 Join data to make big data set

#Make adoption data set based on month 
adoption_april <- adoption[adoption$Month == 'April-19',]
adoption_may <- adoption[adoption$Month == 'May-19',]
adoption_june <- adoption[adoption$Month == 'June-19',]
#Inner join because some ID are unmatched
full_april <- april %>% inner_join(adoption_april, by = 'Customer_ID')%>% 
  select(-Month) %>% 
  mutate( age = time_length(difftime(as.Date(Billing_month), as.Date(Registration_date)), "years")) # Calculate age since registration

full_may <- may %>% inner_join(adoption_may, by = 'Customer_ID')%>% 
  select(-Month) %>% 
  mutate(age = time_length(difftime(as.Date(Billing_month), as.Date(Registration_date)), "years")) # Calculate age since registration

#Training and testing data
train_data <- rbind(full_april, full_may)
test_data <- june %>% inner_join(adoption_june, by = 'Customer_ID') %>% 
  select(-Month)%>% 
  mutate(age = time_length(difftime(as.Date(Billing_month), as.Date(Registration_date)), "years")) # Calculate age since registration

#Full data April May June 
full_data <- rbind(train_data, test_data)
#Check NA 
colSums(is.na(full_data))
##     Billing_month       Customer_ID Registration_date        product_id 
##                 0                 0                 0             67889 
##     Billed_amount     Customer_size          Geo_Code         Visualize 
##                 0                 0                 0                 0 
##             Alert            Report             Tools               age 
##                 0                 0                 0                 0

5 EDA full data set

5.1 Data treatment

5.1.1 Drop NA in product id

#There is no rows that have missing product_id and non-zero billed_amount 
full_data %>% filter(is.na(product_id) & Billed_amount != 0)
## # A tibble: 0 x 12
## # … with 12 variables: Billing_month <dttm>, Customer_ID <chr>,
## #   Registration_date <date>, product_id <dbl>, Billed_amount <dbl>,
## #   Customer_size <chr>, Geo_Code <chr>, Visualize <chr>, Alert <chr>,
## #   Report <chr>, Tools <fct>, age <dbl>
#Filter out missing product_id rows 
full_data<-full_data[!is.na(full_data$product_id),]

5.1.2 ID outlier treatment

full_data <- full_data %>% filter(Registration_date !='2014-01-17' & Billed_amount <= 90000)

5.1.3 Billed_amount outlier treatment

  • Cap values above 95 percentile of each product id
  • Delete negative billed amount
#Outlier

# productID with negative amount
full_data %>% filter(full_data$Billed_amount < 0)  %>% group_by(product_id) %>% summarise(count =n()) #373 negative bills
## # A tibble: 3 x 2
##   product_id count
##        <dbl> <int>
## 1       1925   368
## 2       3991     1
## 3      90691     4
#Delete negative bill amount
full_data$Billed_amount[full_data$Billed_amount <0] <- 0

#Cap values above 95 percentile of each product id 

full_data <- full_data %>%
  group_by(product_id) %>%
  mutate(Capped_billed_amount = squish(Billed_amount, quantile(Billed_amount, c(0.05, 0.95)))) %>% 
  ungroup()

#Visualize new billed amount distribution
ggplot(full_data, aes(y = Billed_amount))+geom_boxplot()

hist(full_data$Billed_amount)

hist(full_data$Capped_billed_amount)

summary(full_data$Billed_amount)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     3.15   414.15   125.39 89476.19
summary(full_data$Capped_billed_amount)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     3.28   326.23   116.00 80802.92
  • mean and median moved closer together after outlier treatment

5.1.4 Regroup product id

#Regroup product id

#find top 5 product 
count_top_prod<-full_data %>% group_by(product_id) %>% summarize(count = n()) 

count_top_prod[order(-count_top_prod$count),]
## # A tibble: 2,554 x 2
##    product_id  count
##         <dbl>  <int>
##  1         10 136084
##  2          8 128750
##  3      10298 121700
##  4      11813 104970
##  5       1958  92445
##  6        938  90604
##  7       2310  83195
##  8       3299  63549
##  9        548  51341
## 10      11921  49448
## # … with 2,544 more rows
#Most used product is 10, 8, 10298, 11813, 1958

#use product_id = 0 as other 
full_data <- full_data %>% mutate(product_id = case_when(product_id == 10 ~ 10, 
                                       product_id == 8 ~ 8,
                                       product_id == 10298 ~ 10298,
                                       product_id == 11813 ~ 11813,
                                       product_id == 1958 ~ 1958,
                                       TRUE ~ 0))
full_data$product_id <- as.factor(full_data$product_id)

5.1.5 Create age group

range(full_data$age)
## [1] -0.07671233 14.78630137
full_data[full_data$age<0,]
## # A tibble: 1,881 x 13
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>           <fct>     
##  1 2019-04-01 00:00:00 0fY8840097… 2019-04-19       10        
##  2 2019-04-01 00:00:00 0fY8840097… 2019-04-19       10298     
##  3 2019-04-01 00:00:00 0fY8840097… 2019-04-19       11813     
##  4 2019-04-01 00:00:00 0fY8840097… 2019-04-19       0         
##  5 2019-04-01 00:00:00 1An1379640… 2019-04-20       8         
##  6 2019-04-01 00:00:00 1An1379640… 2019-04-20       10        
##  7 2019-04-01 00:00:00 1An1379640… 2019-04-20       0         
##  8 2019-04-01 00:00:00 1An1379640… 2019-04-20       0         
##  9 2019-04-01 00:00:00 1An1379640… 2019-04-20       11813     
## 10 2019-04-01 00:00:00 1An1379640… 2019-04-20       0         
## # … with 1,871 more rows, and 9 more variables: Billed_amount <dbl>,
## #   Customer_size <chr>, Geo_Code <chr>, Visualize <chr>, Alert <chr>,
## #   Report <chr>, Tools <fct>, age <dbl>, Capped_billed_amount <dbl>
full_data <- full_data %>% mutate(age_group = case_when(age < 15 & age > 9 ~ 'Old customer (9-15)', 
                                           age <= 9 & age > 4 ~ 'Middle-aged customer (4-9)', 
                                           age <= 4 ~ 'Young customer (0-4)'))

5.1.6 Count product number

#Number of product each month is number of rows grouped by month and ID
full_data <-full_data %>% group_by(Billing_month, Customer_ID) %>%
  mutate(product_quant = n())

max(full_data$product_quant)
## [1] 149
full_data[(full_data$product_quant > 100),]
## # A tibble: 437 x 15
## # Groups:   Billing_month, Customer_ID [3]
##    Billing_month       Customer_ID Registration_da… product_id
##    <dttm>              <chr>       <date>           <fct>     
##  1 2019-04-01 00:00:00 s640820878… 2015-11-25       8         
##  2 2019-04-01 00:00:00 s640820878… 2015-11-25       10        
##  3 2019-04-01 00:00:00 s640820878… 2015-11-25       0         
##  4 2019-04-01 00:00:00 s640820878… 2015-11-25       0         
##  5 2019-04-01 00:00:00 s640820878… 2015-11-25       1958      
##  6 2019-04-01 00:00:00 s640820878… 2015-11-25       0         
##  7 2019-04-01 00:00:00 s640820878… 2015-11-25       0         
##  8 2019-04-01 00:00:00 s640820878… 2015-11-25       0         
##  9 2019-04-01 00:00:00 s640820878… 2015-11-25       0         
## 10 2019-04-01 00:00:00 s640820878… 2015-11-25       10298     
## # … with 427 more rows, and 11 more variables: Billed_amount <dbl>,
## #   Customer_size <chr>, Geo_Code <chr>, Visualize <chr>, Alert <chr>,
## #   Report <chr>, Tools <fct>, age <dbl>, Capped_billed_amount <dbl>,
## #   age_group <chr>, product_quant <int>
#Visualize distribution 
hist(full_data$product_quant)

5.2 Count data by groups

#Count by group
prop.table(table(full_data$Customer_size))*100
## 
##      Large        Mid      Small 
##  0.4903786  2.1048342 97.4047872
prop.table(table(full_data$product_id))*100
## 
##         0         8        10      1958     10298     11813 
## 62.704338  8.223007  8.691415  5.904278  7.772737  6.704225
prop.table(table(full_data$Geo_Code))*100
## 
##          AMER          APAC          CHNA          EMEA GEO-UNCLAIMED 
##  52.667607230  11.983874604   3.121293659  22.877713832   0.008111238 
##          GLBL          JAPN 
##   0.377779296   8.963620141
prop.table(table(full_data$age_group))*100
## 
## Middle-aged customer (4-9)        Old customer (9-15) 
##                  43.254101                   5.830447 
##       Young customer (0-4) 
##                  50.915452

5.3 By Customer Age

#Age distribution 
ggplot(full_data, aes(x = age, fill = age_group))+ geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Age, size and spending
full_data %>% 
  group_by(age_group, Customer_size) %>% 
  summarize(avg_bill_amount = mean(Billed_amount)) %>% 
  ggplot(aes(age_group, fill = Customer_size, y = avg_bill_amount))+geom_bar(stat = 'identity',position ='dodge')

5.4 By Product ID

#Visualize number of use per product
ggplot(full_data, aes(x = product_id)) + geom_bar()

#Visualize product id related to service use 
grid.arrange(ggplot(full_data, aes(x = product_id, fill = Visualize)) + geom_bar(position = 'dodge'),
             ggplot(full_data, aes(x = product_id, fill = Alert)) + geom_bar(position = 'dodge'),
             ggplot(full_data, aes(x = product_id, fill = Report)) + geom_bar(position = 'dodge'), ncol = 1)

#Visualize product id related to service use (Percentage per product)
grid.arrange(
  full_data %>% 
  group_by(product_id, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(product_id,pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),
  full_data %>% 
  group_by(product_id, Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(product_id,pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),
    full_data %>% 
  group_by(product_id, Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(product_id,pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),ncol = 1)

# ggplot(full_data, aes(product_id, Customer_size))+geom_tile()

# Contingency table with customer size
prop.table(table(full_data$product_id, full_data$Customer_size), margin=2)*100
##        
##             Large       Mid     Small
##   0     76.413128 75.169924 62.365951
##   8      4.792915  5.037019  8.309122
##   10     4.792915  5.097706  8.788698
##   1958   4.597551  4.769996  5.935368
##   10298  4.766866  5.024882  7.847249
##   11813  4.636624  4.900473  6.753612

5.5 By Size

# Understand based on size 
grid.arrange(ggplot(full_data, aes(x = Customer_size, fill = Visualize)) + geom_bar(position = 'dodge'),
             ggplot(full_data, aes(x = Customer_size, fill = Alert)) + geom_bar(position = 'dodge'),
             ggplot(full_data, aes(x = Customer_size, fill = Report)) + geom_bar(position = 'dodge'), ncol = 1)

grid.arrange(
  full_data %>% 
  group_by(Customer_size, Visualize) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size,pct), y = pct, fill = Visualize)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),
  full_data %>% 
  group_by(Customer_size, Alert) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size,pct), y = pct, fill = Alert)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),
    full_data %>% 
  group_by(Customer_size, Report) %>% summarise(count=n()) %>% mutate(pct = round(count / sum(count),2)) %>%
  ggplot(aes(x = reorder(Customer_size,pct), y = pct, fill = Report)) + geom_bar(stat='identity', position = 'dodge') + 
  scale_y_continuous(labels = scales::percent),ncol = 1)

5.6 By Geo Code

full_data %>% 
  group_by(Geo_Code) %>% 
  summarize(age = round(mean(age),1), 
            Billed_amount = mean(Billed_amount), 
            num_product = mean(product_quant), 
            num_unique_customer = length(unique(Customer_ID)))
## # A tibble: 7 x 5
##   Geo_Code        age Billed_amount num_product num_unique_customer
##   <chr>         <dbl>         <dbl>       <dbl>               <int>
## 1 AMER            4.7          440.       14.9                26623
## 2 APAC            3.7          376.       13.6                 6982
## 3 CHNA            2.6          650.       10.5                 2568
## 4 EMEA            4.3          357.       14.8                11728
## 5 GEO-UNCLAIMED   1.1         3424.        5.71                  24
## 6 GLBL            3.6          676.       12.5                  244
## 7 JAPN            3.5          365.       14.1                 4575

6 Business Insights

6.1 Analyze total AWS revenue by customer_age or customer_size or Geo_code

# Revenue by customers_age - young and medium group occupy more than 90%
pie_custage  <- full_data %>% group_by(age_group) %>%
    summarize(Total_exp = sum(Capped_billed_amount)) %>%
    mutate(perc = Total_exp / sum(Total_exp))

p <- plot_ly(pie_custage, labels = ~age_group, values = ~perc) %>%
   add_pie(hole = 0.6) %>%
  layout(title = 'revenue by customer_age group across 3 months',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# Revenue by customers_size - more than 90% of business is driven by small customers
pie_custsize  <- full_data %>% group_by(Customer_size) %>%
    summarize(Total_exp = sum(Capped_billed_amount)) %>%
    mutate(perc = Total_exp / sum(Total_exp))

p <- plot_ly(pie_custsize, labels = ~Customer_size, values = ~perc) %>%
   add_pie(hole = 0.6) %>%
  layout(title = 'revenue by customer_size across 3 months',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# Revenue by Geo_Code - 75% of revenue are from AMER & EMER
pie_geo <- full_data %>% group_by(Geo_Code) %>%
    summarize(Total_exp = sum(Capped_billed_amount)) %>%
    mutate(perc = Total_exp / sum(Total_exp))

 p <- plot_ly(pie_geo, labels = ~Geo_Code, values = ~perc) %>%
   add_pie(hole = 0.6) %>%
  layout(title = 'Revenue by Geo_Code accross 3 months',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
 p

6.2 Analyze customer_age group of top10 revenue customer across 3 months - 80% of top10 revenue customers are from young group

# top10 customers across 3 months
top10 <- full_data  %>%  group_by(Customer_ID,Billing_month) %>% mutate(Monthly_exp = sum(Capped_billed_amount))%>% ungroup() %>% select(-c(product_id,Billed_amount,Capped_billed_amount)) %>% distinct()  %>% arrange(-Monthly_exp) %>% group_by(Billing_month) %>%  mutate(rank = row_number())  %>% filter(rank<=10) %>% arrange(Billing_month,-Monthly_exp)
top10
## # A tibble: 30 x 14
## # Groups:   Billing_month [3]
##    Billing_month       Customer_ID Registration_da… Customer_size Geo_Code
##    <dttm>              <chr>       <date>           <chr>         <chr>   
##  1 2019-04-01 00:00:00 j590734619… 2017-02-24       Large         AMER    
##  2 2019-04-01 00:00:00 GG50789640… 2016-05-13       Large         AMER    
##  3 2019-04-01 00:00:00 Wha3477019… 2014-04-15       Mid           AMER    
##  4 2019-04-01 00:00:00 B024432886… 2019-03-07       Mid           APAC    
##  5 2019-04-01 00:00:00 CcA0911110… 2010-09-23       Mid           AMER    
##  6 2019-04-01 00:00:00 ivv5206204… 2018-01-15       Mid           APAC    
##  7 2019-04-01 00:00:00 EZT7940349… 2018-04-03       Mid           GEO-UNC…
##  8 2019-04-01 00:00:00 XRM4187246… 2018-08-23       Small         APAC    
##  9 2019-04-01 00:00:00 Pga0074070… 2016-11-28       Mid           AMER    
## 10 2019-04-01 00:00:00 Moa5298658… 2018-03-23       Large         AMER    
## # … with 20 more rows, and 9 more variables: Visualize <chr>, Alert <chr>,
## #   Report <chr>, Tools <fct>, age <dbl>, age_group <chr>,
## #   product_quant <int>, Monthly_exp <dbl>, rank <int>
# visualize the customer_age group for the customers
pie_top10  <- top10 %>% group_by(age_group) %>%
    summarize(count = n()) %>%
    mutate(perc = count / sum(count))

p <- plot_ly(pie_top10, labels = ~age_group, values = ~perc) %>%
   add_pie(hole = 0.6) %>%
  layout(title = 'customer_age group of top10 revenue customer across 3 months',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

6.3 Analyze total Monthly Expenditure by Customer Age - large customers are likely to spend more

p <- full_data %>% group_by(Customer_ID,Billing_month) %>% mutate(Monthly_exp = sum(Capped_billed_amount))%>% ungroup() %>%
  select(age,Monthly_exp,Customer_size) %>% distinct() %>% mutate (Customer_age= as.factor(age)) 

ggplot(p, aes(Customer_age,Monthly_exp, color=Customer_size)) + geom_point(alpha=.5) + scale_size_area() 

6.4 Analyze Tools subscription by Customer size - High usage rate - 80% of large customers use at least one of the tools

#  Tools subscription of within same size customers

tools <- full_data %>% group_by(Customer_size,Tools) %>% 
  summarise(count = n ()) %>% group_by(Customer_size) %>% 
  mutate(count_g = sum (count)) %>% mutate(perc = count / count_g)
tools
## # A tibble: 24 x 5
## # Groups:   Customer_size [3]
##    Customer_size Tools count count_g   perc
##    <chr>         <fct> <int>   <int>  <dbl>
##  1 Large         A       178    7678 0.0232
##  2 Large         AR      169    7678 0.0220
##  3 Large         N      1538    7678 0.200 
##  4 Large         R       555    7678 0.0723
##  5 Large         V      2880    7678 0.375 
##  6 Large         VA      469    7678 0.0611
##  7 Large         VAR     444    7678 0.0578
##  8 Large         VR     1445    7678 0.188 
##  9 Mid           A       677   32956 0.0205
## 10 Mid           AR      433   32956 0.0131
## # … with 14 more rows
p <- ggplot(data = tools,aes(x=Customer_size,y=perc,fill=Tools)) + geom_bar(stat="identity",position='fill') 
p

#  Analyze Tools subscription of large size customers 

p <- tools %>% filter(Customer_size == 'Large') %>% plot_ly(labels = ~Tools, values = ~perc) %>%
   add_pie(hole = 0.6) %>%
  layout(title = 'Product Expenditures by Customer_size accross 3 months',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

6.5 Analyze Tools subscription by Customer age group - the percentage of each tools is almost the same across age groups!

#  Tools subscription of within same customer age group 
tools <- full_data %>% group_by(age_group,Tools) %>% 
  summarise(count = n ()) %>% group_by(age_group) %>% 
  mutate(count_g = sum (count)) %>% mutate(perc = count / count_g)
tools
## # A tibble: 24 x 5
## # Groups:   age_group [3]
##    age_group                  Tools  count count_g    perc
##    <chr>                      <fct>  <int>   <int>   <dbl>
##  1 Middle-aged customer (4-9) A      20558  677242 0.0304 
##  2 Middle-aged customer (4-9) AR      2988  677242 0.00441
##  3 Middle-aged customer (4-9) N     405220  677242 0.598  
##  4 Middle-aged customer (4-9) R      26005  677242 0.0384 
##  5 Middle-aged customer (4-9) V     170601  677242 0.252  
##  6 Middle-aged customer (4-9) VA     20418  677242 0.0301 
##  7 Middle-aged customer (4-9) VAR     6157  677242 0.00909
##  8 Middle-aged customer (4-9) VR     25295  677242 0.0374 
##  9 Old customer (9-15)        A       3272   91289 0.0358 
## 10 Old customer (9-15)        AR       535   91289 0.00586
## # … with 14 more rows
p <- ggplot(data = tools,aes(x=age_group,y=perc,fill=Tools)) + geom_bar(stat="identity",position='fill') 

p

6.6 Analyze Tools usage by Geo_code - High potential market - about 80% of Chinese customers haven’t used any tool

#  Tools subscription of within same Geo_code customer 
tools <- full_data %>% group_by(Geo_Code,Tools) %>% 
  summarise(count = n ()) %>% group_by(Geo_Code) %>% 
  mutate(count_g = sum (count)) %>% mutate(perc = count / count_g)
tools
## # A tibble: 49 x 5
## # Groups:   Geo_Code [7]
##    Geo_Code Tools  count count_g    perc
##    <chr>    <fct>  <int>   <int>   <dbl>
##  1 AMER     A      29686  824632 0.0360 
##  2 AMER     AR      3712  824632 0.00450
##  3 AMER     N     503992  824632 0.611  
##  4 AMER     R      30148  824632 0.0366 
##  5 AMER     V     193970  824632 0.235  
##  6 AMER     VA     26738  824632 0.0324 
##  7 AMER     VAR     7258  824632 0.00880
##  8 AMER     VR     29128  824632 0.0353 
##  9 APAC     A       7594  187635 0.0405 
## 10 APAC     AR       959  187635 0.00511
## # … with 39 more rows
p <- ggplot(data = tools,aes(x=Geo_Code,y=perc,fill=Tools)) + geom_bar(stat="identity",position='fill') 

p

#  Analyze Tools subscription of customers in Chinese Market

p <- tools %>% filter(Geo_Code == 'CHNA') %>% plot_ly(labels = ~Tools, values = ~perc) %>%
   add_pie(hole = 0.6) %>%
    layout(title = 'Tools subscription in China accross 3 months',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
# save(full_data, file = 'full_data.RData')
# write.csv(full_data, 'full_data.csv')